library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.1.2
##
## Attaching package: 'kernlab'
##
## The following object is masked from 'package:purrr':
##
## cross
##
## The following object is masked from 'package:ggplot2':
##
## alpha
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.1.2
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(clValid) # For selecting cluster method and number of clusters
## Loading required package: cluster
library(factoextra) # for cluster fitting and visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(uwot) # For UMAP dimensionality reduction
## Warning: package 'uwot' was built under R version 4.1.2
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(patchwork) # For arranging multiple plots
## Warning: package 'patchwork' was built under R version 4.1.2
library(dplyr)
library(skimr)
## Warning: package 'skimr' was built under R version 4.1.2
set.seed(888) # To ensure consistent results from non-deterministic procedures
rm(list = ls()) # Removes all variables
compas.df = read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores.csv")
## New names:
## Rows: 11757 Columns: 47
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (19): name, first, last, sex, age_cat, race, c_case_number, c_charge_de... dbl
## (14): id, age, juv_fel_count, decile_score...12, juv_misd_count, juv_ot... lgl
## (2): num_r_cases, num_vr_cases dttm (2): c_jail_in, c_jail_out date (10):
## compas_screening_date, dob, c_offense_date, c_arrest_date, r_offe...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `decile_score` -> `decile_score...12`
## • `decile_score` -> `decile_score...45`
colnames(compas.df) = make.names(colnames(compas.df), unique = TRUE) # Ensures names are valid and don't contain spaces
compas.df %>% skimr::skim() # Check the skim results to identify missing data
Data summary
| Name |
Piped data |
| Number of rows |
11757 |
| Number of columns |
47 |
| _______________________ |
|
| Column type frequency: |
|
| character |
19 |
| Date |
10 |
| logical |
2 |
| numeric |
14 |
| POSIXct |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| name |
0 |
1.00 |
6 |
29 |
0 |
11584 |
0 |
| first |
0 |
1.00 |
2 |
13 |
0 |
4058 |
0 |
| last |
0 |
1.00 |
1 |
20 |
0 |
5921 |
0 |
| sex |
0 |
1.00 |
4 |
6 |
0 |
2 |
0 |
| age_cat |
0 |
1.00 |
7 |
15 |
0 |
3 |
0 |
| race |
0 |
1.00 |
5 |
16 |
0 |
6 |
0 |
| c_case_number |
742 |
0.94 |
13 |
13 |
0 |
11015 |
0 |
| c_charge_degree |
0 |
1.00 |
1 |
1 |
0 |
3 |
0 |
| c_charge_desc |
749 |
0.94 |
5 |
52 |
0 |
531 |
0 |
| r_case_number |
8054 |
0.31 |
13 |
13 |
0 |
3703 |
0 |
| r_charge_degree |
0 |
1.00 |
1 |
1 |
0 |
3 |
0 |
| r_charge_desc |
8114 |
0.31 |
6 |
52 |
0 |
352 |
0 |
| vr_case_number |
10875 |
0.08 |
13 |
13 |
0 |
882 |
0 |
| vr_charge_degree |
10875 |
0.08 |
4 |
5 |
0 |
9 |
0 |
| vr_charge_desc |
10875 |
0.08 |
7 |
37 |
0 |
88 |
0 |
| v_type_of_assessment |
0 |
1.00 |
16 |
16 |
0 |
1 |
0 |
| v_score_text |
0 |
1.00 |
3 |
6 |
0 |
4 |
0 |
| type_of_assessment |
0 |
1.00 |
18 |
18 |
0 |
1 |
0 |
| score_text |
0 |
1.00 |
3 |
6 |
0 |
4 |
0 |
Variable type: Date
| compas_screening_date |
0 |
1.00 |
2013-01-01 |
2014-12-31 |
2013-12-12 |
704 |
| dob |
0 |
1.00 |
1919-10-14 |
1998-03-29 |
1983-11-14 |
7800 |
| c_offense_date |
2600 |
0.78 |
1987-11-07 |
2014-12-30 |
2013-11-25 |
1036 |
| c_arrest_date |
9899 |
0.16 |
1997-06-18 |
2014-12-31 |
2013-09-18 |
802 |
| r_offense_date |
8054 |
0.31 |
2013-01-03 |
2016-03-29 |
2014-08-27 |
1090 |
| r_jail_in |
9297 |
0.21 |
2013-01-04 |
2016-03-29 |
2014-08-21 |
984 |
| r_jail_out |
9297 |
0.21 |
2013-01-05 |
2020-01-01 |
2014-10-10 |
953 |
| vr_offense_date |
10875 |
0.08 |
2013-01-28 |
2016-03-13 |
2014-11-19 |
599 |
| v_screening_date |
0 |
1.00 |
2013-01-01 |
2014-12-31 |
2013-12-12 |
704 |
| screening_date |
0 |
1.00 |
2013-01-01 |
2014-12-31 |
2013-12-12 |
704 |
Variable type: logical
| num_r_cases |
11757 |
0 |
NaN |
: |
| num_vr_cases |
11757 |
0 |
NaN |
: |
Variable type: numeric
| id |
0 |
1.00 |
5879.00 |
3394.10 |
1 |
2940 |
5879 |
8818 |
11757 |
▇▇▇▇▇ |
| age |
0 |
1.00 |
35.14 |
12.02 |
18 |
25 |
32 |
43 |
96 |
▇▅▂▁▁ |
| juv_fel_count |
0 |
1.00 |
0.06 |
0.45 |
0 |
0 |
0 |
0 |
20 |
▇▁▁▁▁ |
| decile_score…12 |
0 |
1.00 |
4.37 |
2.88 |
-1 |
2 |
4 |
7 |
10 |
▇▇▆▆▆ |
| juv_misd_count |
0 |
1.00 |
0.08 |
0.45 |
0 |
0 |
0 |
0 |
13 |
▇▁▁▁▁ |
| juv_other_count |
0 |
1.00 |
0.09 |
0.47 |
0 |
0 |
0 |
0 |
17 |
▇▁▁▁▁ |
| priors_count |
0 |
1.00 |
3.08 |
4.69 |
0 |
0 |
1 |
4 |
43 |
▇▁▁▁▁ |
| days_b_screening_arrest |
1180 |
0.90 |
-0.88 |
72.89 |
-597 |
-1 |
-1 |
-1 |
1057 |
▁▇▁▁▁ |
| c_days_from_compas |
742 |
0.94 |
63.59 |
341.90 |
0 |
1 |
1 |
2 |
9485 |
▇▁▁▁▁ |
| is_recid |
0 |
1.00 |
0.25 |
0.56 |
-1 |
0 |
0 |
1 |
1 |
▁▁▇▁▅ |
| r_days_from_arrest |
9297 |
0.21 |
20.41 |
74.35 |
-1 |
0 |
0 |
1 |
993 |
▇▁▁▁▁ |
| is_violent_recid |
0 |
1.00 |
0.08 |
0.26 |
0 |
0 |
0 |
0 |
1 |
▇▁▁▁▁ |
| v_decile_score |
0 |
1.00 |
3.57 |
2.50 |
-1 |
1 |
3 |
5 |
10 |
▇▇▆▃▂ |
| decile_score…45 |
0 |
1.00 |
4.37 |
2.88 |
-1 |
2 |
4 |
7 |
10 |
▇▇▆▆▆ |
Variable type: POSIXct
| c_jail_in |
1180 |
0.9 |
2013-01-01 01:31:55 |
2016-03-11 10:26:16 |
2013-12-12 03:14:03 |
10577 |
| c_jail_out |
1180 |
0.9 |
2013-01-02 01:12:01 |
2020-01-01 00:00:00 |
2014-01-05 08:05:17 |
10517 |
compas.df <- compas.df %>% #chatgpt code help to only select 2 numeric columns
select(priors_count, age, race, sex)
#Changing string values for sex and race in the data to numeric values so that they can be used as part of the analysis
compas.df$sex <- as.factor(compas.df$sex)
compas.df$sex <- as.numeric(compas.df$sex)
compas.df$race<- as.factor(compas.df$race)
compas.df$race<- as.numeric(compas.df$race)
## Prepare data with removing NA and scaling
scaled.compas.df = compas.df %>%
drop_na() %>% # Removes rows with missing data
#select(-priors_count)
scale() %>% as.data.frame()
skim(scaled.compas.df)
Data summary
| Name |
scaled.compas.df |
| Number of rows |
11757 |
| Number of columns |
4 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
4 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| priors_count |
0 |
1 |
0 |
1 |
-0.66 |
-0.66 |
-0.44 |
0.20 |
8.52 |
▇▁▁▁▁ |
| age |
0 |
1 |
0 |
1 |
-1.43 |
-0.84 |
-0.26 |
0.65 |
5.06 |
▇▅▂▁▁ |
| race |
0 |
1 |
0 |
1 |
-0.89 |
-0.89 |
0.50 |
0.50 |
2.59 |
▇▆▂▁▁ |
| sex |
0 |
1 |
0 |
1 |
-1.96 |
0.51 |
0.51 |
0.51 |
0.51 |
▂▁▁▁▇ |
## Select a sample of the dataset to speed processing for exploratory analysis
sample.scaled.compas.df = scaled.compas.df %>% sample_n(size = 400)
## Internal metrics
internal.cl = clValid(sample.scaled.compas.df,
nClust = 2:10,
clMethods = c("kmeans","pam", "agnes", "diana"),
maxitems = 1000, # specifies the number of cases considered
validation = "internal")
## Warning in clValid(sample.scaled.compas.df, nClust = 2:10, clMethods =
## c("kmeans", : rownames for data not specified, using 1:nrow(data)
## View internal metrics
summary(internal.cl)
##
## Clustering Methods:
## kmeans pam agnes diana
##
## Cluster sizes:
## 2 3 4 5 6 7 8 9 10
##
## Validation Measures:
## 2 3 4 5 6 7 8 9 10
##
## kmeans Connectivity 57.3837 84.6595 87.9552 67.4298 83.4560 84.0433 99.3833 105.8353 103.4929
## Dunn 0.0268 0.0143 0.0173 0.0180 0.0389 0.0407 0.0433 0.0433 0.0433
## Silhouette 0.2095 0.2567 0.2833 0.3768 0.3467 0.3441 0.3439 0.3473 0.3977
## pam Connectivity 30.3782 20.6306 58.6563 84.6000 87.2790 85.6048 81.5952 93.1036 94.0627
## Dunn 0.0119 0.0127 0.0122 0.0149 0.0149 0.0155 0.0155 0.0210 0.0210
## Silhouette 0.2344 0.3184 0.3493 0.3550 0.3573 0.3682 0.3875 0.4057 0.4001
## agnes Connectivity 4.3004 4.4254 8.4794 14.5385 20.4476 21.3698 48.6893 51.4333 54.7651
## Dunn 0.1448 0.1593 0.1622 0.1622 0.1622 0.1622 0.0402 0.0402 0.0402
## Silhouette 0.4798 0.3793 0.3404 0.3284 0.3151 0.2894 0.3313 0.3435 0.3377
## diana Connectivity 6.0591 27.5853 31.8857 72.9746 79.6798 79.8048 87.9885 91.1353 92.4492
## Dunn 0.1207 0.0496 0.0519 0.0172 0.0193 0.0199 0.0200 0.0222 0.0238
## Silhouette 0.3518 0.3646 0.3336 0.3540 0.3550 0.3464 0.3528 0.3909 0.3908
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 4.3004 agnes 2
## Dunn 0.1622 agnes 4
## Silhouette 0.4798 agnes 2
plot(internal.cl)



#Agnes performs best with 2 clusters for silhouette and connectivity, for dunn agnes is slightly better with 3 clusters but 2 clusters is a close second so agnes with 2 clusters is best solution.
## Stability metrics
stability.cl = clValid(sample.scaled.compas.df,
nClust = 2:10,
clMethods = c("kmeans","pam", "agnes", "diana"),
maxitems = 1700, # specifies the number of cases considered
validation = "stability")
## Warning in clValid(sample.scaled.compas.df, nClust = 2:10, clMethods =
## c("kmeans", : rownames for data not specified, using 1:nrow(data)
## View stability metrics
summary(stability.cl)
##
## Clustering Methods:
## kmeans pam agnes diana
##
## Cluster sizes:
## 2 3 4 5 6 7 8 9 10
##
## Validation Measures:
## 2 3 4 5 6 7 8 9 10
##
## kmeans APN 0.1397 0.3167 0.3037 0.1710 0.2563 0.3584 0.4357 0.4519 0.3543
## AD 2.3837 2.2975 2.1864 1.8086 1.8031 1.7586 1.7055 1.6832 1.5485
## ADM 0.4452 1.0306 0.9894 0.7283 0.8701 0.8820 0.9905 0.9953 0.8967
## FOM 0.9763 0.9796 0.9812 0.9819 0.9803 0.9741 0.9690 0.9697 0.9716
## pam APN 0.2269 0.2531 0.2573 0.2586 0.2936 0.2947 0.3510 0.3930 0.3930
## AD 2.3938 2.0300 1.8512 1.7360 1.6902 1.6202 1.5517 1.4882 1.4502
## ADM 0.7922 0.7061 0.7571 0.7824 0.8483 0.8711 0.8361 0.8083 0.8035
## FOM 0.9844 0.9802 0.9732 0.9683 0.9681 0.9667 0.9607 0.9612 0.9591
## agnes APN 0.0474 0.0732 0.0415 0.0431 0.1180 0.2769 0.3228 0.3220 0.3953
## AD 2.4495 2.1531 2.0504 2.0201 1.9963 1.9903 1.8350 1.7992 1.7804
## ADM 0.1696 0.3678 0.3681 0.3715 0.4698 0.7340 0.8465 0.8614 0.9076
## FOM 0.9830 0.9821 0.9825 0.9829 0.9838 0.9823 0.9790 0.9797 0.9763
## diana APN 0.3381 0.3049 0.4776 0.3360 0.3479 0.3631 0.3411 0.2626 0.2819
## AD 2.5044 2.3795 2.3571 2.0782 1.9265 1.9066 1.8427 1.6614 1.5832
## ADM 1.0954 1.0890 1.2699 1.0594 1.0007 1.0077 0.9751 0.8941 0.8414
## FOM 0.9784 0.9765 0.9746 0.9748 0.9728 0.9737 0.9746 0.9716 0.9718
##
## Optimal Scores:
##
## Score Method Clusters
## APN 0.0415 agnes 4
## AD 1.4502 pam 10
## ADM 0.1696 agnes 2
## FOM 0.9591 pam 10
plot(stability.cl)




compas.agnes = eclust(sample.scaled.compas.df,
FUNcluster = "agnes",
nboot = 200,
seed = 888)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.
# Silhouette plot
fviz_silhouette(compas.agnes)
## cluster size ave.sil.width
## 1 1 34 0.39
## 2 2 37 0.24
## 3 3 53 0.08
## 4 4 89 0.61
## 5 5 79 0.39
## 6 6 44 0.26
## 7 7 37 0.56
## 8 8 18 0.41
## 9 9 9 0.58

#Cluster 4 has best fit
compas.agnes = eclust(sample.scaled.compas.df,
FUNcluster = "agnes",
k = 2 ,
hc_metric = "euclidean", hc_method = "ward.D2", # Distance metric and aglomeration method
seed = 888)
# Silhouette plot
fviz_silhouette(compas.agnes)
## cluster size ave.sil.width
## 1 1 78 0.42
## 2 2 322 0.35

# Dendrogam plot
fviz_dend(compas.agnes)

# Plot cluster membership in PCA space
fviz_cluster(compas.agnes)

## Apply umap to data
umap.df = umap(sample.scaled.compas.df, n_neighbors = 50, n_components = 2) %>% scale()
colnames(umap.df) = c("umap1", "umap2")
umap.df = as.data.frame(umap.df)
umap.plot = ggplot(umap.df, aes(umap1, umap2)) +
geom_point(size = .5) +
labs(title = "UMAP-transformed data") +
theme_bw()
umap.plot

sample.scaled.compas.df = cbind(sample.scaled.compas.df, umap.df)
## More components capture more information
umap4.df = umap(sample.scaled.compas.df, n_neighbors = 50, n_components = 4) %>% scale()
## Cluster based on UMAP data
internal.cl = clValid(umap4.df,
nClust = 2:15,
clMethods = c("kmeans", "pam", "agnes", "diana"),
maxitems = 1700,
validation = "internal")
## Warning in clValid(umap4.df, nClust = 2:15, clMethods = c("kmeans", "pam", :
## rownames for data not specified, using 1:nrow(data)
## View internal metrics
summary(internal.cl)
##
## Clustering Methods:
## kmeans pam agnes diana
##
## Cluster sizes:
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15
##
## Validation Measures:
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15
##
## kmeans Connectivity 0.0000 0.0000 7.9841 11.3762 12.4944 19.5901 26.2103 28.6071 48.1460 49.2373 52.4012 54.9413 63.3437 62.0829
## Dunn 0.7424 0.6979 0.0154 0.0249 0.0789 0.0463 0.0414 0.0481 0.0649 0.0307 0.0382 0.0384 0.0391 0.0391
## Silhouette 0.5927 0.7106 0.6963 0.6860 0.6749 0.6509 0.6356 0.6324 0.5854 0.5849 0.5428 0.5549 0.5364 0.5530
## pam Connectivity 0.0000 0.0000 3.3921 9.0417 12.4944 19.1762 22.3401 29.7647 45.5619 58.5857 67.5488 68.2972 61.0607 64.2091
## Dunn 0.7424 0.6979 0.0406 0.0454 0.0789 0.0492 0.0492 0.0338 0.0353 0.0390 0.0437 0.0483 0.0501 0.0509
## Silhouette 0.5927 0.7106 0.6661 0.6878 0.6749 0.6522 0.6100 0.5948 0.5412 0.5417 0.5343 0.5452 0.5666 0.5527
## agnes Connectivity 0.0000 0.0000 3.8913 7.2833 11.1496 12.3730 15.2131 20.4988 26.5647 34.6917 35.7012 36.4595 40.5647 45.5567
## Dunn 0.7424 0.6979 0.0312 0.0327 0.0546 0.0546 0.0737 0.0743 0.0809 0.0933 0.0933 0.0933 0.1128 0.0799
## Silhouette 0.5927 0.7106 0.6734 0.6635 0.6686 0.6433 0.6247 0.6215 0.5594 0.5736 0.5302 0.5408 0.5284 0.5374
## diana Connectivity 0.0000 0.0000 7.9841 11.3762 24.2675 25.9675 32.2944 44.3496 48.9571 52.1210 60.0000 68.7956 72.8397 78.8056
## Dunn 0.7424 0.6979 0.0154 0.0249 0.0270 0.0286 0.0367 0.0417 0.0481 0.0506 0.0518 0.0485 0.0520 0.0538
## Silhouette 0.5927 0.7106 0.6963 0.6860 0.6274 0.6391 0.6151 0.5676 0.5624 0.5202 0.5179 0.5093 0.4926 0.4786
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 0.0000 kmeans 2
## Dunn 0.7424 kmeans 2
## Silhouette 0.7106 kmeans 3
plot(internal.cl)



## Cluster based on UMAP data
stability.cl = clValid(umap4.df,
nClust = 2:15,
clMethods = c("kmeans", "pam", "agnes", "diana"),
maxitems = 1700,
validation = "stability")
## Warning in clValid(umap4.df, nClust = 2:15, clMethods = c("kmeans", "pam", :
## rownames for data not specified, using 1:nrow(data)
## View internal metrics
summary(stability.cl)
##
## Clustering Methods:
## kmeans pam agnes diana
##
## Cluster sizes:
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15
##
## Validation Measures:
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15
##
## kmeans APN 0.0000 0.0419 0.1029 0.0879 0.0591 0.0820 0.0768 0.0910 0.1437 0.1334 0.1213 0.1184 0.1644 0.1340
## AD 1.5863 0.8968 0.7446 0.5759 0.4085 0.3732 0.3396 0.3228 0.3137 0.2902 0.2568 0.2417 0.2309 0.2139
## ADM 0.0000 0.1555 0.2777 0.2213 0.0760 0.0905 0.0768 0.0730 0.1103 0.0941 0.0863 0.0830 0.0893 0.0739
## FOM 0.6582 0.4361 0.3950 0.3457 0.1609 0.1574 0.1489 0.1470 0.1462 0.1398 0.1319 0.1276 0.1121 0.1061
## pam APN 0.3018 0.0398 0.0989 0.0189 0.0576 0.1361 0.1528 0.1542 0.1870 0.1607 0.1559 0.1784 0.1324 0.1349
## AD 2.1369 0.8913 0.7635 0.4555 0.4078 0.3952 0.3604 0.3344 0.3148 0.2755 0.2544 0.2444 0.2218 0.2142
## ADM 1.2229 0.1460 0.2877 0.0239 0.0703 0.1484 0.1434 0.1415 0.1527 0.1095 0.0967 0.1015 0.0739 0.0744
## FOM 0.9780 0.4312 0.3710 0.1653 0.1634 0.1583 0.1494 0.1445 0.1386 0.1221 0.1126 0.1125 0.1078 0.1062
## agnes APN 0.0000 0.0341 0.0980 0.0840 0.0622 0.0705 0.0938 0.1165 0.1442 0.1289 0.1138 0.1278 0.1352 0.1078
## AD 1.5863 0.8784 0.7530 0.5940 0.4138 0.3811 0.3607 0.3372 0.3223 0.2946 0.2609 0.2476 0.2337 0.2179
## ADM 0.0000 0.1225 0.2298 0.2078 0.0798 0.0863 0.0865 0.0928 0.1204 0.0979 0.0893 0.0925 0.0853 0.0676
## FOM 0.6582 0.4180 0.3814 0.3425 0.1597 0.1547 0.1507 0.1493 0.1481 0.1412 0.1333 0.1259 0.1136 0.1082
## diana APN 0.1006 0.0374 0.1215 0.0747 0.1135 0.1327 0.1309 0.1654 0.1623 0.1812 0.1711 0.1681 0.1687 0.1776
## AD 1.7686 0.8856 0.7471 0.5737 0.4792 0.4023 0.3632 0.3439 0.3223 0.3095 0.2751 0.2487 0.2427 0.2351
## ADM 0.4071 0.1360 0.2913 0.2150 0.1621 0.1291 0.1168 0.1219 0.1108 0.1442 0.1139 0.0849 0.0882 0.0912
## FOM 0.8189 0.4257 0.3815 0.3462 0.2231 0.1594 0.1529 0.1500 0.1477 0.1459 0.1390 0.1221 0.1219 0.1191
##
## Optimal Scores:
##
## Score Method Clusters
## APN 0.0000 kmeans 2
## AD 0.2139 kmeans 15
## ADM 0.0000 kmeans 2
## FOM 0.1061 kmeans 15
plot(stability.cl)




## UMAP and kmeans
umap.compas.kmean = eclust(umap.df,
FUNcluster = "kmeans",
k = 8,
seed = 888)

sample.scaled.compas.df = cbind(sample.scaled.compas.df, cluster = as.factor(umap.compas.kmean$cluster))
km_umap.plot =
ggplot(sample.scaled.compas.df, aes(umap1, umap2, colour = cluster)) +
geom_point(size = 1) +
labs(title = "Kmeans clustering based on UMAP transformed data", x = "", y = "") +
theme_bw() +
theme(legend.position = "none")
km_umap.plot

## Hierarchical density based clustering is very sensitive to the minimum number of points
sample.scaled.compas.df$hdb.cluster = hdbscan(as.matrix(umap4.df), minPts = 15)$cluster %>%
as.factor()
hdbscan.plot =
ggplot(sample.scaled.compas.df, aes(umap1, umap2, colour = hdb.cluster)) +
geom_point(size = 1) +
labs(title = "HDBscan clustering based on UMAP transformed data", x = "", y = "") +
theme_bw() +
theme(legend.position = "none")
hdbscan.plot

#There are 3 main clusters of data in the kmeans clustering graphs. 2 big clustters have the majority of the clusters, one cluster is by itself that seems to be an outlier. However, it illustrates that 3 clusters could work well for this data but more data is required to truly undersatnd the data and its relevance.